Data Storyteller | Adaptable Collaborator | Synthetic Biologist | Rapid Prototyper
import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.linear_model
import holoviews as hv
import colorcet as cc
from health import *
hv.extension('bokeh')
blue = cc.b_glasbey_hv[0]
red = cc.b_glasbey_hv[1]
Drop 'Limited access to healthy foods' (see exploratory_data_analysis - 15% of this data is missing).
health_file = 'California - v14.1 - released May 17th, 2022/CHDB_data_tract_CA_v14.1.txt'
health_data = load_health_data(health_file).drop(['Limited access to healthy foods'], axis=1)
income_data_file = 'california_median_household_income_acs/ACSDT5Y2018.B19013_data_with_overlays_2022-06-23T200015.csv'
income_data = load_hinc_data(income_data_file)
df = pd.merge(health_data, income_data[['median income', 'stcotr_fips']])
First, let's try just dropping missing values.
n = len(df)
print(f'Original number of data points: {n}.')
df.dropna(inplace=True)
print(f'Number of rows dropped: {n - len(df)}, {100 * (n - len(df)) / n:.1f}%.')
Original number of data points: 6075. Number of rows dropped: 407, 6.7%.
df['log10(median income)'] = np.log10(np.array(df['median income']))
column_renaming = {column_name: column_name.replace(' ', '\n') for column_name in df.columns}
ds = hv.Dataset(
data=df.rename(columns=column_renaming),
kdims=['stcotr_fips'],
)
(
hv.operation.gridmatrix(ds, chart_type=hv.Points)[:, 'median\nincome'] +
hv.operation.gridmatrix(ds, chart_type=hv.Points)[:, 'log10(median\nincome)']
).cols(
1
)
We will use ridge regression with built-in cross-validation, provided by the sklearn library.
X = df.drop(
['median income', 'log10(median income)', 'stcotr_fips'], axis=1
)
y = np.log10(
np.array(df['median income']).reshape(-1, 1)
)
transformer = sklearn.preprocessing.RobustScaler()
transformer.fit(X)
X_scaled = transformer.transform(X)
model = sklearn.linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
model.fit(X_scaled, y)
RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]))
model.alpha_
1.0
train_score = model.score(X_scaled, y)
train_score
0.9490166406907403
y_pred = model.predict(X_scaled)
plot = plot_prediction_vs_actual(y_pred, y, 'median income',log10=True)
plot.opts(
hv.opts.AdjointLayout(title=f'score: {train_score:.2f}'),
hv.opts.Points(alpha=0.4)
)
metrics = X.columns
metrics_sorted = np.array(metrics)[np.argsort(model.coef_)]
metrics_sorted = metrics_sorted.tolist()[0]
weights = model.coef_.copy()
weights.sort()
weights = weights.tolist()[0]
df_weights = pd.DataFrame({'metrics': metrics_sorted, 'weights': weights})
feature_weights = hv.Bars(df_weights, kdims='metrics')
feature_weights.opts(
frame_width=800,
frame_height=400,
xrotation=45,
)
metrics = X.columns
metrics_abs_sorted = np.array(metrics)[np.argsort(np.abs(model.coef_))]
metrics_abs_sorted = metrics_abs_sorted.tolist()[0]
abs_weights = model.coef_.copy()
abs_weights = np.abs(abs_weights)
abs_weights.sort()
abs_weights = abs_weights.tolist()[0]
df_abs_weights = pd.DataFrame({'metrics': metrics_abs_sorted, 'weights (absolute value)': abs_weights})
feature_abs_weights = hv.Bars(df_abs_weights, kdims='metrics')
feature_abs_weights.opts(
frame_width=800,
frame_height=400,
xrotation=45,
)
Although we achieved a high score on fitting to training data, we can see that 'Income Inequality' is the dominant feature, which is sort of cheating.
X = df.drop(
['median income', 'log10(median income)', 'Income Inequality', 'stcotr_fips'], axis=1
)
y = np.log10(
np.array(df['median income']).reshape(-1, 1)
)
transformer = sklearn.preprocessing.RobustScaler()
transformer.fit(X)
X_scaled = transformer.transform(X)
model = sklearn.linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
model.fit(X_scaled, y)
/Users/lealia/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py:1791: RuntimeWarning: invalid value encountered in reciprocal w = ((singvals_sq + alpha) ** -1) - (alpha ** -1)
RidgeCV(alphas=array([1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01,
1.e+02, 1.e+03, 1.e+04, 1.e+05, 1.e+06]))
model.alpha_
1.0
train_score = model.score(X_scaled, y)
train_score
0.8858283254741887
y_pred = model.predict(X_scaled)
plot = plot_prediction_vs_actual(y_pred, y, 'median income',log10=True)
plot.opts(
hv.opts.AdjointLayout(title=f'score: {train_score:.2f}'),
hv.opts.Points(alpha=0.4)
)
We consistently underpredict high household incomes. This may be because the highest incomes were labeled as '250,000+', and I replaced that label with 250000. There are other ways to address this that I haven't implemented.
metrics = X.columns
metrics_sorted = np.array(metrics)[np.argsort(model.coef_)]
metrics_sorted = metrics_sorted.tolist()[0]
weights = model.coef_.copy()
weights.sort()
weights = weights.tolist()[0]
df_weights = pd.DataFrame({'metrics': metrics_sorted, 'weights': weights})
feature_weights = hv.Bars(df_weights, kdims='metrics')
feature_weights.opts(
frame_width=800,
frame_height=400,
xrotation=45,
)
metrics = X.columns
metrics_abs_sorted = np.array(metrics)[np.argsort(np.abs(model.coef_))]
metrics_abs_sorted = metrics_abs_sorted.tolist()[0]
abs_weights = model.coef_.copy()
abs_weights = np.abs(abs_weights)
abs_weights.sort()
abs_weights = abs_weights.tolist()[0]
df_abs_weights = pd.DataFrame({'metrics': metrics_abs_sorted, 'weights (absolute value)': abs_weights})
feature_abs_weights = hv.Bars(df_abs_weights, kdims='metrics')
feature_abs_weights.opts(
frame_width=800,
frame_height=400,
xrotation=45,
)
These metrics are much more evenly weighted.
import json
with open('metrics_sorted_by_weight.json', 'w') as f:
metrics_json = json.dumps(metrics_abs_sorted[::-1])
json.dump(metrics_json, f)
pearson_correlations = np.zeros(len(metrics))
for i, metric in enumerate(metrics):
corr = df['median income'].corr(df[metric])
pearson_correlations[i] = corr
df_corr = pd.DataFrame({'metrics': metrics, 'correlation': pearson_correlations.tolist()})
corr_bars = hv.Bars(df_corr, kdims='metrics')
df_weights_unsorted = pd.DataFrame({'metrics': metrics, 'weights': model.coef_.tolist()[0]})
feature_weights = hv.Bars(df_weights_unsorted, kdims='metrics')
(
feature_weights.opts(
frame_width=800,
frame_height=200,
xaxis=None,
) +
corr_bars.opts(
frame_width=800,
frame_height=200,
xrotation=45,
color='orange'
)
).cols(
1
)
Can we use what we learned in California to predict median income in Colorado?
test_health_file = 'Colorado - v14.1 - released May 17th, 2022/CHDB_data_tract_CO_v14.1.txt'
test_health_data = load_health_data(test_health_file).drop(['Limited access to healthy foods'], axis=1)
test_income_data_file = 'colorado_median_household_income_acs/ACSDT5Y2018.B19013_data_with_overlays_2022-06-23T225911.csv'
test_income_data = load_hinc_data(test_income_data_file)
test_df = pd.merge(test_health_data, test_income_data[['median income', 'stcotr_fips']])
n = len(test_df)
print(f'Original number of data points: {n}.')
test_df.dropna(inplace=True)
print(f'Number of rows dropped: {n - len(test_df)}, {100 * (n - len(test_df)) / n:.1f}%.')
Original number of data points: 804. Number of rows dropped: 111, 13.8%.
X_test = test_df.drop(
['median income', 'Income Inequality', 'stcotr_fips'], axis=1
)
y_test_actual = np.log10(
np.array(test_df['median income']).reshape(-1, 1)
)
transformer = sklearn.preprocessing.RobustScaler()
transformer.fit(X)
X_test_scaled = transformer.transform(X_test)
y_test_pred = model.predict(X_test_scaled)
test_score = model.score(X_test_scaled, y_test_actual)
test_score
0.49248220210869775
plot_test = plot_prediction_vs_actual(y_test_pred, y_test_actual, label='median income', log10=True)
plot_test.opts(
hv.opts.Layout(title=f'score: {test_score:.2f}'),
hv.opts.Points(color=red, alpha=0.4),
hv.opts.Distribution(color=red)
)
The model scored quite poorly on Colorado data -- but, it looks like there might be some systematic error...
(
hv.Distribution(y, kdims=['log10(income)'], label='California') * hv.Distribution(y_test_actual, kdims=['log10(income)'], label='Colorado')
).opts(
legend_position='right',
frame_width=400,
frame_height=300,
)
Indeed, the whole distribution of Colorado incomes is shifted to the left of California incomes, which may contribute to why our model predicts a higher income than the true value consistently.
colorado_state_median = np.median(y_test_actual)
colorado_state_median
4.825185858559518
california_state_median = np.median(y)
california_state_median
4.850499199426245
delta = colorado_state_median - california_state_median
y_test_adj = y_test_actual - delta
(
hv.Distribution(y, kdims=['log10(income)'], label='California') * hv.Distribution(y_test_adj, kdims=['log10(income)'], label='Colorado (adjusted to California median)')
).opts(
legend_position='right',
frame_width=400,
frame_height=300,
)
test_score = model.score(X_test_scaled, y_test_adj)
test_score
0.642861281963671
plot_test_adj = plot_prediction_vs_actual(y_test_pred, y_test_adj, label='median income', log10=True)
plot_test_adj.opts(
hv.opts.Layout(title=f'score: {test_score:.2f}'),
hv.opts.Points(color=red, alpha=0.4),
hv.opts.Distribution(color=red)
)
We can still observe some systematic bias - most predictions are still too high. However, we do see an improvement in the model score after applying the systematic correction.
test_df['log10(median income)'] = np.log10(np.array(test_df['median income']))
column_renaming = {column_name: column_name.replace(' ', '\n') for column_name in test_df.columns}
test_ds = hv.Dataset(
data=test_df.rename(columns=column_renaming),
kdims=['stcotr_fips'],
)
(
hv.operation.gridmatrix(test_ds, chart_type=hv.Points)[:, 'median\nincome'] +
hv.operation.gridmatrix(test_ds, chart_type=hv.Points)[:, 'log10(median\nincome)']
).opts(
hv.opts.Points(color=red),
hv.opts.Histogram(color=red),
).cols(
1
)